# load libraries ----
# reporting
library(knitr)
# visualization
library(ggplot2)
library(ggthemes)
library(patchwork)
# maps
library(sf)
library(rnaturalearth)
library(maps)
# data wrangling
library(dplyr)
library(tidyr)
library(readr)
library(skimr)
library(janitor)
library(magrittr)
library(lubridate)
library(stringr)
# modelling
library(tidymodels)
library(vip)
library(randomForest)
library(doParallel)
# set other options ----
options(scipen=999)
knitr::opts_chunk$set(
tidy = FALSE,
message = FALSE,
warning = FALSE)

28 Evaluate the model results
Learning Objectives
After completing this tutorial you should be able to
- visualize spatial data using a map
- evaluate model performance
Download the directory for this project here, make sure the directory is unzipped and move it to your bi328 directory. You can open the Rproj for this module either by double clicking on it which will launch Rstudio or by opening Rstudio and then using File > Open Project or by clicking on the Rproject icon in the top right of your program window and selecting Open Project.
There should be a file named 28_analyze-model-results.qmd in that project directory. Use that file to work through this tutorial - you will hand in your rendered (“knitted”) quarto file as your homework assignment. So, first thing in the YAML header, change the author to your name. You will use this quarto document to record your answers. Remember to use comments to annotate your code; at minimum you should have one comment per code set1 you may of course add as many comments as you need to be able to recall what you did. Similarly, take notes in the document as we discuss discussion/reflection questions but make sure that you go back and clean them up for “public consumption”.
1 You should do this whether you are adding code yourself or using code from our manual, even if it isn’t commented in the manual… especially when the code is already included for you, add comments to describe how the function works/what it does as we introduce it during the participatory coding session so you can refer back to it.
Before you get started, let’s make sure to read in all the packages we will need.
28.1 Data Visualization
Recall, that for this module we will ask the central question Can we predict annual average air pollution concentrations at the resolution of zip code regional levels using predictor variables describing population density, urbanization, road density, satellite pollution data, and chemical modeling data?
We have successfully built and tested a model that fulfills our requirements (yay us!).
Let’s go ahead and run through all the steps to fit that model so we are ready to go.
# register a parallel backend with two worker cores for model tuning
doParallel::registerDoParallel(cores = 2)
# import the air pollution data and tidy it up:
# standardize column names, turn the identifier columns into factors,
# and collapse city into a two-level in/out-of-a-city variable
pm <- read_delim("data/air_pollution.csv", delim = ",") %>%
  clean_names() %>%
  mutate(
    across(c(id, fips, zcta), as.factor),
    city = if_else(city == "Not in a city", "Not in a city", "In a city")
  )
# split sample ----
# set the RNG seed so the random split (and therefore every result
# shown below) is reproducible across renders
set.seed(1234)
# hold out 1/3 of the observations as a test set
pm_split <- rsample::initial_split(data = pm, prop = 2/3)
# extract training data set
train_pm <- training(pm_split)
# extract test data set
test_pm <- testing(pm_split)
# split the training data into 4 folds for cross-validation
kfold_pm <- rsample::vfold_cv(data = train_pm, v = 4)
# create recipe to pre-process data set and assign variable roles
RF_rec <- recipe(train_pm) %>%
update_role(everything(), new_role = "predictor")%>% # start with every column as a predictor
update_role(value, new_role = "outcome")%>% # value (annual PM2.5 concentration) is the outcome
update_role(id, new_role = "id variable") %>% # monitor id: kept in the data, not used for fitting
update_role("fips", new_role = "county id") %>% # county identifier: kept, not a predictor
step_novel("state") %>% # allow state levels not seen during training
step_string2factor("state", "county", "city") %>% # convert character columns to factors
step_rm("county") %>% # drop county name (fips already identifies the county)
step_rm("zcta") %>% # drop zip-code tabulation area id
step_corr(all_numeric())%>% # remove highly correlated numeric predictors
step_nzv(all_numeric()) # remove near-zero-variance numeric predictors
# random forest regression model; mark mtry and min_n for tuning
tune_RF <-
  rand_forest(mtry = tune(), min_n = tune()) %>%
  set_mode("regression") %>%
  set_engine("randomForest")
# bundle the pre-processing recipe and the model into a single workflow
RF_tune_wflow <-
  workflow() %>%
  add_recipe(RF_rec) %>%
  add_model(tune_RF)
# fit the workflow across the cross-validation folds, tuning mtry and
# min_n over a grid of 20 candidate parameter combinations
tune_RF_results <- tune_grid(object = RF_tune_wflow, resamples = kfold_pm, grid = 20)
# pull out the parameter combination with the best (lowest) RMSE;
# `metric` must be passed by name — current versions of tune treat
# positional arguments after the first as dots and reject this call
tuned_RF_values <- select_best(tune_RF_results, metric = "rmse")
# define workflow using optimized parameters
RF_tuned_wflow <-RF_tune_wflow %>%
tune::finalize_workflow(tuned_RF_values)

If we put this in the context of science as “story-telling”, our narrative is that we are able to use a set of predictor values that are generally gathered at much higher resolution compared to air pollution levels to predict annual average air pollution concentrations in areas with no or only sparse monitors with reasonable accuracy.
Let’s start by accessing a simple feature (sf) object that contains information on how to plot polygons for all countries using rnaturalearth::ne_countries().
world <- ne_countries(scale = "medium", returnclass = "sf")

Here is what your maps should look like:
# fill
map_color <- "khaki"
# plot world map
ggplot() +
geom_sf(data = world, color = "black", fill = map_color)

We are only interested in plotting the continental US, which has the following latitude/longitude bounds:
- Northern bound: 49.3457868
- Western bound: -124.7844079
- Eastern bound: -66.9513812
- Southern bound: 24.7433195
# lat/long bounding box defining the continental US map extent
x_min <- -124.7844079 # western bound
x_max <- -66.9513812 # eastern bound
y_min <- 24.7433195 # southern bound
y_max <- 49.3457868 # northern bound
# set color for fill
map_color <- "khaki"
# create plot
ggplot() +
geom_sf(data = world, color = "black", fill = map_color) + # plot outline of countries
coord_sf(xlim = c(x_min, x_max),
ylim = c(y_min, y_max)) # set boundaries for map

In a previous chapter, we accessed data sets containing state lines. For this data set we predicted air pollution at the county level, so let’s go ahead and use maps::map() to access a data set with county polygons. Because we need to make sure that the object is a simple feature (sf), we will need to use sf::st_as_sf() to convert the downloaded object to the correct format.
# retrieve the US county polygons and convert the returned map object
# to a simple feature (sf) object so ggplot2::geom_sf() can draw it
counties <- st_as_sf(
  maps::map("county", plot = FALSE, fill = TRUE)
)
# object information
counties

Simple feature collection with 3076 features and 1 field
Geometry type: MULTIPOLYGON
Dimension: XY
Bounding box: xmin: -124.6813 ymin: 25.12993 xmax: -67.00742 ymax: 49.38323
Geodetic CRS: +proj=longlat +ellps=clrk66 +no_defs +type=crs
First 10 features:
ID geom
alabama,autauga alabama,autauga MULTIPOLYGON (((-86.50517 3...
alabama,baldwin alabama,baldwin MULTIPOLYGON (((-87.93757 3...
alabama,barbour alabama,barbour MULTIPOLYGON (((-85.42801 3...
alabama,bibb alabama,bibb MULTIPOLYGON (((-87.02083 3...
alabama,blount alabama,blount MULTIPOLYGON (((-86.9578 33...
alabama,bullock alabama,bullock MULTIPOLYGON (((-85.66866 3...
alabama,butler alabama,butler MULTIPOLYGON (((-86.8604 31...
alabama,calhoun alabama,calhoun MULTIPOLYGON (((-85.74313 3...
alabama,chambers alabama,chambers MULTIPOLYGON (((-85.59416 3...
alabama,cherokee alabama,cherokee MULTIPOLYGON (((-85.46812 3...
# create plot
ggplot() +
geom_sf(data = world, color = "black", fill = map_color) + # plot outline of countries
geom_sf(data = counties, fill = NA, color = "black") + # add county lines
geom_point(data = pm, aes(x = lon, y = lat), # add sites
size = 2, shape = 23, fill = "darkred") +
coord_sf(xlim = c(x_min, x_max),
ylim = c(y_min, y_max))

This map effectively illustrates that there are distinct differences in the density of monitors and shows that, especially in rural areas, the distribution can be quite sparse. While you would not necessarily include this figure to underscore your results, it could be effective to include it in your introduction to describe the need to create the model in the first place.
Now, let’s generate the set of maps comparing the observed and predicted levels of air pollution, i.e. the figure that we want to use to make the point that indeed, we are able to build and train a model that can be used to predict air pollution levels based on a set of predictor variables.
First we are going to need to make sure that our county data set with the information to plot county polygons and the data.frames containing the observed and predicted air pollution values can be joined, i.e. they need to have a column in common.
Let’s take a look at the two objects, both contain a column with county information but you will see that the county information is formatted differently. Use bullet points to describe the steps that you will need to take so that the values will match, suggest which functions you can use to achieve this.
First, the polygons:
head(counties)

Simple feature collection with 6 features and 1 field
Geometry type: MULTIPOLYGON
Dimension: XY
Bounding box: xmin: -88.01778 ymin: 30.24071 xmax: -85.06131 ymax: 34.2686
Geodetic CRS: +proj=longlat +ellps=clrk66 +no_defs +type=crs
ID geom
alabama,autauga alabama,autauga MULTIPOLYGON (((-86.50517 3...
alabama,baldwin alabama,baldwin MULTIPOLYGON (((-87.93757 3...
alabama,barbour alabama,barbour MULTIPOLYGON (((-85.42801 3...
alabama,bibb alabama,bibb MULTIPOLYGON (((-87.02083 3...
alabama,blount alabama,blount MULTIPOLYGON (((-86.9578 33...
alabama,bullock alabama,bullock MULTIPOLYGON (((-85.66866 3...
Now our air pollution data:
pm %>%
pull(county) %>%
head()

[1] "Baldwin" "Clay" "Colbert" "DeKalb" "Etowah" "Houston"
First, we need to separate the county and state information in the counties data set; then, we need to convert the values in the county column to title case to match the pm data set.
# split the combined "state,county" ID into separate columns and
# convert the county names to title case so they match pm$county
counties <- counties %>%
separate(ID, into = c("state", "county"), sep = ",", remove = FALSE) %>% # keep the original ID column
mutate(county = str_to_title(county)) # e.g. "autauga" -> "Autauga"
head(counties)

Simple feature collection with 6 features and 3 fields
Geometry type: MULTIPOLYGON
Dimension: XY
Bounding box: xmin: -88.01778 ymin: 30.24071 xmax: -85.06131 ymax: 34.2686
Geodetic CRS: +proj=longlat +ellps=clrk66 +no_defs +type=crs
ID state county geom
alabama,autauga alabama,autauga alabama Autauga MULTIPOLYGON (((-86.50517 3...
alabama,baldwin alabama,baldwin alabama Baldwin MULTIPOLYGON (((-87.93757 3...
alabama,barbour alabama,barbour alabama Barbour MULTIPOLYGON (((-85.42801 3...
alabama,bibb alabama,bibb alabama Bibb MULTIPOLYGON (((-87.02083 3...
alabama,blount alabama,blount alabama Blount MULTIPOLYGON (((-86.9578 33...
alabama,bullock alabama,bullock alabama Bullock MULTIPOLYGON (((-85.66866 3...
Now we can use inner_join() to combine the two data sets.
map_data <- counties %>%
inner_join(pm, by = "county")

We’re ready to create a map displaying the observed mean annual air pollution data for each county.
# map of observed mean annual PM2.5 levels by county
p1 <- ggplot() +
geom_sf(data = world, color = "black", fill = NA) + # country outlines only (no fill)
geom_sf(data = map_data, aes(fill = value), color = "black") + # counties filled by observed PM2.5
scale_fill_viridis_c() + # perceptually uniform continuous color scale
coord_sf(xlim = c(x_min, x_max),
ylim = c(y_min, y_max)) + # restrict view to the continental US
labs(title = "Observed mean annual PM2.5 concentration")
p1

Now, let’s create the same plot but displaying the predicted values. To do this we will first need to get all the predicted values for both our training and test data sets and then join that data set with our counties data set as above to be able to plot it.
# fit the finalized workflow on the TRAINING set only.
# NOTE(fix): the original code re-fit the workflow on the test set and
# used that refit to predict the test monitors — that leaks the test
# data into the model, so the test predictions no longer reflect
# out-of-sample performance. All predictions must come from train_fit.
train_fit <- RF_tuned_wflow %>%
fit(data = train_pm)
# predict values for monitors in the training set
pred_train <- train_fit %>%
predict(train_pm) %>%
bind_cols(train_pm) %>% # attach observed values + identifiers
select(.pred, value, fips, county, id)
# predict values for monitors in the held-out test set,
# using the model fit on the training data
pred_test <- train_fit %>%
predict(test_pm) %>%
bind_cols(test_pm) %>%
select(.pred, value, fips, county, id)
# combine predictions for all monitors for mapping
pred_PM25 <- pred_test %>%
bind_rows(pred_train)
# add counties data for plotting
map_data <- counties %>%
inner_join(pred_PM25, by = "county")

Now we should be able to create a plot with our predicted PM2.5 levels.
# map of PREDICTED mean annual PM2.5 levels by county.
# NOTE(fix): fill must map to .pred (the model predictions) — the
# original mapped fill = value, which re-plots the observed data and
# would make p2 identical to p1
p2 <- ggplot() +
geom_sf(data = world, color = "black", fill = NA) + # country outlines only (no fill)
geom_sf(data = map_data, aes(fill = .pred), color = "black") + # counties filled by predicted PM2.5
scale_fill_viridis_c() + # same color scale as p1 for comparability
coord_sf(xlim = c(x_min, x_max),
ylim = c(y_min, y_max)) + # restrict view to the continental US
labs(title = "Predicted mean annual PM2.5 concentration")
p2

Finally, we’ll use patchwork to create our final figure. We’ll use patchwork::plot_annotation() to add a title and legend2.
2 Note that adding \nforces a line break. Put that in your back pocket for creating good visualizations!
p1 / p2 +
plot_annotation(title = "Comparison of observed (top) and predicted (bottom) PM2.5 levels.",
subtitle = "A random forest model was trained to predict air PM2.5 levels based on predictor \nvariables including population levels, road density, development.")3 This is the typical word limits for Abstracts